RÉSUMÉ DES VARIABLES D’INTÉRÊT :

# Load the filtered HEV metadata table and preview its first five columns.
data_HEV <- read.csv2(file = "HEV_metadonneesFiltrees.csv")
head(data_HEV[, seq_len(5)])
##           ID  ACCESSION                    SOURCE                  ORGANISM
## 1 2574440105 OR224867\n         Hepeviridae sp.\n         Hepeviridae sp.\n
## 2 2553477909 MZ751062\n Paslahepevirus balayani\n Paslahepevirus balayani\n
## 3 2553477905 MZ751061\n Paslahepevirus balayani\n Paslahepevirus balayani\n
## 4 2550033136 LC704570\n Paslahepevirus balayani\n Paslahepevirus balayani\n
## 5 2550033132 LC704569\n Paslahepevirus balayani\n Paslahepevirus balayani\n
## 6 2528474488 OP909736\n         Hepeviridae sp.\n         Hepeviridae sp.\n
##                                                                AUTHORS
## 1                                        Liu,T., Yuan,D. and Wang,L.\n
## 2                    Liang,C., Zhao,C., Huang,W., Wang,Y. and She,R.\n
## 3                    Liang,C., Zhao,C., Huang,W., Wang,Y. and She,R.\n
## 4                 Takahashi,M., Kunita,S., Nishizawa,T., Ohnishi,H.,\n
## 5                 Takahashi,M., Kunita,S., Nishizawa,T., Ohnishi,H.,\n
## 6 Klink,P., Harms,D., Altmann,B., Dorffel,Y., Morgera,U., Zander,S.,\n
# Preview columns 6 to 10 of the metadata table (first six rows).
head(data_HEV[, 6:10], n = 6L)
##                                                                TITLE
## 1                                                Direct Submission\n
## 2                                                Direct Submission\n
## 3                                                Direct Submission\n
## 4 Infection Dynamics and Genomic Mutations of Hepatitis E Virus in\n
## 5 Infection Dynamics and Genomic Mutations of Hepatitis E Virus in\n
## 6  Molecular characterisation of a rabbit Hepatitis E Virus strain\n
##           Assembly.Method Sequencing.Technology Location.Qualifiers
## 1 MobaXterm v. 2022.1.6\n              PacBio\n           1..7296\n
## 2       spades v. 3.0.1\n            Illumina\n           1..7331\n
## 3      spades v. v3.0.1\n            Illumina\n           1..7311\n
## 4                    <NA>                  <NA>           1..7247\n
## 5                    <NA>                  <NA>           1..7246\n
## 6         bwa v. 0.7.15\n            Illumina\n           1..7282\n
##                     genotype
## 1                       <NA>
## 2                       <NA>
## 3                       <NA>
## 4 genotype 3 (subtype 3b)"\n
## 5 genotype 3 (subtype 3b)"\n
## 6                       <NA>
# Columns summarised as categorical variables (order = order in the summary).
categorical_cols <- c(
  "Assembly.Method", "Sequencing.Technology", "genotype",
  "isolation_source", "host", "country", "ORF"
)

# Normalise a label before it becomes a factor level: drop stray double quotes
# and surrounding whitespace/newlines. Without this, the same category is
# split across several levels (e.g. `genotype 3\n` vs `genotype 3"\n`, or
# `"ORF4"` next to the unquoted `ORF1`).
clean_label <- function(x) {
  trimws(gsub("\"", "", as.character(x), fixed = TRUE))
}

# Convert every categorical column to a factor in one pass, then print the
# per-level counts.
data_HEV[categorical_cols] <- lapply(
  data_HEV[categorical_cols],
  function(column) factor(clean_label(column))
)
summary(data_HEV[, categorical_cols])
##          Assembly.Method                          Sequencing.Technology
##  Tanoti v. 1\n     : 40   Sanger dideoxy sequencing\n          :144    
##  Geneious v. 2021\n: 24   Illumina\n                           :115    
##  BioEdit v. 7.2.5\n:  8   Illumina; Sanger dideoxy sequencing\n: 20    
##  Bowtie v. 2\n     :  8   IonTorrent\n                         :  6    
##  IDBA-UD v. 1.1.1\n:  8   454\n                                :  4    
##  (Other)           : 90   (Other)                              : 13    
##  NA's              :898   NA's                                 :774    
##                  genotype     isolation_source             host    
##  genotype 3\n          :108   "blood" :297     "Homo sapiens":488  
##  genotype 3"\n         : 24   "liver" : 82     "swine"       : 44  
##  genotype 4"\n         : 14   "feces" : 46     "rabbit"      : 39  
##  genotype 4, isolate:\n: 10   "serum" : 42     "Sus scrofa"  : 20  
##  genotype 3, isolate:\n:  7   "plasma": 41     "wild boar"   : 18  
##  (Other)               : 91   (Other) :198     (Other)       :201  
##  NA's                  :822   NA's    :370     NA's          :266  
##                       country        ORF     
##  "France"                 :316   "ORF4":  2  
##  "China"                  :116   ORF1  :236  
##  "Japan:Hokkaido, Sapporo": 78   ORF2  :  1  
##  "Japan"                  : 27   ORF3  :  5  
##  "Japan:Tokyo"            : 27   NA's  :832  
##  "Australia"              : 26               
##  (Other)                  :486

VISUALISATION DES DONNÉES :

library(plotly)

# Count the sequences recorded for each year, then draw the yearly totals as
# an interactive line chart with one x-axis tick per year.
sequences_per_year <- data_HEV %>%
  group_by(Annee) %>%
  summarise(SequenceCount = n())

year_axis <- list(
  title = "Année",
  tickmode = "linear",
  tick0 = min(data_HEV$Annee),
  dtick = 1
)

plot_ly(
  data = sequences_per_year,
  x = ~Annee,
  y = ~SequenceCount,
  type = "scatter",
  mode = "lines+markers",
  text = ~SequenceCount
) %>%
  layout(
    title = "Évolution du Nombre de Séquences au Fil des Années",
    xaxis = year_axis,
    yaxis = list(title = "Nombre de Séquences"),
    hovermode = "closest"
  )
# Choropleth of the number of sequences per country.
#
# The raw `country` values carry literal double quotes and, for some records,
# a region suffix after a colon (e.g. "Japan:Tokyo"). Such labels cannot be
# matched by locationmode = "country names", so each value is reduced to the
# bare country name before counting; records from different regions of the
# same country are therefore pooled. Records with no country are dropped
# because they cannot be placed on the map.
data_HEV %>%
  mutate(
    country = trimws(
      sub(":.*", "", gsub("\"", "", as.character(country), fixed = TRUE))
    )
  ) %>%
  filter(!is.na(country), country != "") %>%
  group_by(country) %>%
  summarise(Count = n()) %>%
  plot_geo(
    locations = ~country,
    z = ~Count,
    locationmode = "country names"
  ) %>%
  layout(title = "Répartition des Données par Pays")
# Pie chart of the share of sequences carrying each ORF value.
orf_pie_counts <- data_HEV %>%
  group_by(ORF) %>%
  summarise(Count = n())

plot_ly(
  data = orf_pie_counts,
  labels = ~ORF,
  values = ~Count,
  type = "pie"
) %>%
  layout(title = "Distribution ORF")
# Pie chart of the share of sequences obtained with each sequencing
# technology. The chart title previously ended in the truncated word
# "Sequenca"; it is completed here.
data_HEV %>%
  group_by(Sequencing.Technology) %>%
  summarise(Count = n()) %>%
  plot_ly(
    labels = ~Sequencing.Technology,
    values = ~Count,
    type = "pie"
  ) %>%
  layout(
    title = "Distribution sequences en fonction des Technologies de Sequencage"
  )
# Bar chart of the number of sequences per ORF value.
orf_bar_counts <- data_HEV %>%
  group_by(ORF) %>%
  summarise(Count = n())

plot_ly(data = orf_bar_counts, x = ~ORF, y = ~Count, type = "bar") %>%
  layout(title = "Repartition des sequences en fonction des ORF")
## Warning: Ignoring 1 observations
# Bar chart of the number of sequences per SOURCE value.
source_counts <- data_HEV %>%
  group_by(SOURCE) %>%
  summarise(Count = n())

plot_ly(data = source_counts, x = ~SOURCE, y = ~Count, type = "bar") %>%
  layout(title = "Repartition des sequences en fonction des souches")
# Bar chart of the number of sequences per isolation source, drawn in yellow.
#
# The colour is wrapped in I() so plotly uses it as a literal colour. A bare
# string passed to `color` is treated as a data value and mapped through the
# default palette, which ignores the requested colour and triggers
# RColorBrewer "minimal value for n is 3" warnings.
data_HEV %>%
  group_by(isolation_source) %>%
  summarise(Count = n()) %>%
  plot_ly(
    x = ~isolation_source,
    y = ~Count,
    type = "bar",
    color = I("yellow")
  ) %>%
  layout(title = "Repartition par source d'isolation")
## Warning: Ignoring 1 observations
## Warning in RColorBrewer::brewer.pal(N, "Set2"): minimal value for n is 3, returning requested palette with 3 different levels
## Warning in RColorBrewer::brewer.pal(N, "Set2"): minimal value for n is 3, returning requested palette with 3 different levels
# Chargement des bibliothèques nécessaires
library(ggplot2)
library(plotly)
library(dplyr)

# Number of sequences sharing each Location.Qualifiers value.
data_HEV_grouped <- data_HEV %>%
  group_by(Location.Qualifiers) %>%
  summarise(Nombre_de_Sequences = n())

# Look up, for every row of data_HEV, the count of its own location.
# The grouped table has one row per distinct location whereas the scatter has
# one point per sequence, so passing the grouped column directly as marker
# sizes pairs counts with the wrong points. match() also pairs NA with NA, so
# rows without a location get the count of the NA group.
sequences_per_location <- data_HEV_grouped$Nombre_de_Sequences[
  match(data_HEV$Location.Qualifiers, data_HEV_grouped$Location.Qualifiers)
]

# Interactive scatter plot: sequence length on the x axis, genomic coordinates
# on the y axis, marker size given by the number of sequences at that location
# (raw counts are passed as marker sizes, as before).
plot_ly(
  data = data_HEV,
  x = ~Sequence_Length,
  y = ~Location.Qualifiers,
  type = "scatter",
  mode = "markers",
  marker = list(size = sequences_per_location)
) %>%
  layout(
    title = "Diagramme de dispersion des coordonnées génomiques, des longueurs séquences",
    xaxis = list(title = "Longueur de la séquence"),
    yaxis = list(title = "Coordonnées génomiques"),
    showlegend = FALSE
  )